[My Github link] https://github.com/sydneydlu98/DSI_Data_Challenge_5
In this Data Challenge, we will be clustering foods from the nndb_flat dataset provided on Canvas. To load/clean the data as well as perform some exploratory analysis:
## load all the packages
library(readr)
library(dplyr)
library(GGally)
library(tidyverse)
library(plotly)
library(ggplot2)
## read in data
data <- read_csv("nndb_flat.csv")
## filter the data to only contain food groups of Vegetables and Vegetable Products, Beef Products, and Sweets
object <-
c("Sweets", "Beef Products", "Vegetables and Vegetable Products")
clean_data <- data %>%
subset(FoodGroup %in% object)
var <- clean_data %>%
select(Energy_kcal:Zinc_mg)
GGally::ggcorr(
var,
size = 3.2,
label = TRUE,
label_size = 2.7,
hjust = .9,
layout.exp = 2
)
If the coefficient value lies between ± 0.50 and ± 1, then it is said to be a strong correlation. If the value is near ± 1, then it said to be a perfect correlation: as one variable increases, the other variable tends to also increase (if positive) or decrease (if negative).
We can see from the plot, we do not have any high negative correlation, but we do have many high positive correlation. Such as the correlation coefficient between Folate_mcg and Thiamin_mg is 1, which is perfect positive correlation, and others like Protein_g and Zinc_mg has a correlation coefficient of 0.9 which is considered high correlation.
Steps for performing the PCA on the data:
data_scaled <- scale(var)
pca_data <- prcomp(data_scaled,
center = FALSE,
scale. = FALSE)
summary(pca_data)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.3024 1.8455 1.6577 1.6205 1.41770 1.03167 0.9922
## Proportion of Variance 0.2305 0.1481 0.1195 0.1142 0.08739 0.04628 0.0428
## Cumulative Proportion 0.2305 0.3785 0.4980 0.6122 0.69960 0.74587 0.7887
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.90473 0.85456 0.8279 0.74550 0.71398 0.61241 0.58307
## Proportion of Variance 0.03559 0.03175 0.0298 0.02416 0.02216 0.01631 0.01478
## Cumulative Proportion 0.82426 0.85602 0.8858 0.90998 0.93214 0.94845 0.96323
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.46385 0.42317 0.38016 0.31599 0.27229 0.23596 0.22717
## Proportion of Variance 0.00935 0.00779 0.00628 0.00434 0.00322 0.00242 0.00224
## Cumulative Proportion 0.97258 0.98037 0.98665 0.99099 0.99422 0.99664 0.99888
## PC22 PC23
## Standard deviation 0.1440 0.07045
## Proportion of Variance 0.0009 0.00022
## Cumulative Proportion 0.9998 1.00000
var_explainded <- summary(pca_data)$importance[2, ]
cumulative <- cumsum(var_explainded)
var_explained_df <- data.frame(PC = 1:23,
var_explainded = var_explainded,
cum_var_explained = cumulative)
var_explained_df
var_explained_df %>%
ggplot(aes(x = PC, y = cum_var_explained, group = 1)) +
geom_point() +
geom_line(lwd = 1) +
labs(title = "Scree Plot: PCA",
y = 'Cumulative Variation Explained',
x = 'PC') +
ggtitle("Plot of cumulative proportion of the variation explained by each PC") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
# add line for better visualization
geom_vline(xintercept = 3,
linetype = 2,
lwd = 1,
col = "red")
pca_loadings <- as.data.frame(pca_data$rotation) %>%
dplyr::select(PC1, PC2, PC3) %>%
mutate(variable = rownames(pca_data$rotation))
pc1 <- ggplot(pca_loadings, aes(x = variable,
y = abs(PC1))) +
geom_bar(stat = 'identity', fill = "#FF6666") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC1 for all of the variables") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc1
pc2 <- ggplot(pca_loadings, aes(x = variable,
y = abs(PC2))) +
geom_bar(stat = 'identity', fill = "darkgreen") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC2 for all of the variables") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc2
pc3 <- ggplot(pca_loadings, aes(x = variable,
y = abs(PC3))) +
geom_bar(stat = 'identity', fill = "blue") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC3 for all of the variables") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc3
pca_scores <- as.data.frame(pca_data$x)
pca_scores <- pca_scores %>%
mutate(FoodGroup = clean_data$FoodGroup) %>%
mutate(description = clean_data$ShortDescrip)
pca_scores
plot1 <- ggplot(pca_scores,
aes(
x = PC1,
y = PC2,
col = FoodGroup,
label = description
)) +
geom_point() +
ggtitle("PC1 versus PC2") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot1)
plot2 <- ggplot(pca_scores, aes(x = PC1,
y = PC3,
col = FoodGroup,
label = description)) +
geom_point() +
ggtitle("PC1 versus PC3") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot2)
plot3 <- ggplot(pca_scores, aes(x = PC2,
y = PC3,
col = FoodGroup,
label = description)) +
geom_point() +
ggtitle("PC2 versus PC3") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot3)
The major outlier on the plots above is yeast extract spread from the food group vegetable and vegatable products.
Then we remove this outlier.
## remove the outlier I identified above
complete_data <- clean_data %>%
filter(ShortDescrip != "YEAST EXTRACT SPREAD")
var_new <- complete_data %>%
select(Energy_kcal:Zinc_mg)
data_scaled_new <- scale(var_new)
pca_data_new <- prcomp(data_scaled_new,
center = FALSE,
scale. = FALSE)
summary(pca_data_new)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 2.4047 1.9174 1.6360 1.5452 1.05222 1.02786 0.92913
## Proportion of Variance 0.2514 0.1598 0.1164 0.1038 0.04814 0.04593 0.03753
## Cumulative Proportion 0.2514 0.4113 0.5276 0.6314 0.67959 0.72552 0.76306
## PC8 PC9 PC10 PC11 PC12 PC13 PC14
## Standard deviation 0.91925 0.8348 0.78281 0.76116 0.72114 0.70611 0.59976
## Proportion of Variance 0.03674 0.0303 0.02664 0.02519 0.02261 0.02168 0.01564
## Cumulative Proportion 0.79980 0.8301 0.85674 0.88193 0.90454 0.92621 0.94185
## PC15 PC16 PC17 PC18 PC19 PC20 PC21
## Standard deviation 0.58556 0.51769 0.43258 0.42121 0.35832 0.31552 0.27864
## Proportion of Variance 0.01491 0.01165 0.00814 0.00771 0.00558 0.00433 0.00338
## Cumulative Proportion 0.95676 0.96841 0.97655 0.98426 0.98985 0.99417 0.99755
## PC22 PC23
## Standard deviation 0.22669 0.07037
## Proportion of Variance 0.00223 0.00022
## Cumulative Proportion 0.99978 1.00000
var_explainded_new <- summary(pca_data_new)$importance[2,]
cumulative_new <- cumsum(var_explainded_new)
var_explained_df_new <- data.frame(PC = 1:23,
var_explainded = var_explainded_new,
cum_var_explained = cumulative_new)
var_explained_df_new
var_explained_df_new %>%
ggplot(aes(x = PC, y = cum_var_explained, group = 1)) +
geom_point() +
geom_line(lwd = 1) +
labs(title = "Scree Plot: PCA",
y = 'Cumulative Variation Explained',
x = 'PC') +
ggtitle("Plot of cumulative proportion of the variation explained by each PC \n (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
# add line for better visualization
geom_vline(
xintercept = 3,
linetype = 2,
lwd = 1,
col = "red"
)
pca_loadings_new <- as.data.frame(pca_data_new$rotation) %>%
dplyr::select(PC1, PC2, PC3) %>%
mutate(variable = rownames(pca_data_new$rotation))
pc1_new <- ggplot(pca_loadings_new, aes(x = variable,
y = abs(PC1))) +
geom_bar(stat = 'identity', fill = "#FF6666") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC1 for all of the variables \n (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc1_new
pc2_new <- ggplot(pca_loadings_new, aes(x = variable,
y = abs(PC2))) +
geom_bar(stat = 'identity', fill = "darkgreen") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC2 for all of the variables \n (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc2_new
pc3_new <- ggplot(pca_loadings_new, aes(x = variable,
y = abs(PC3))) +
geom_bar(stat = 'identity', fill = "blue") +
theme(axis.text.x = element_text(
angle = 50,
hjust = 1,
size = 13
)) +
ggtitle("Plot for the loadings of PC3 for all of the variables \n (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16)) +
labs(y = "Loadings", x = "Variables")
pc3_new
pca_scores_new <- as.data.frame(pca_data_new$x)
pca_scores_new <- pca_scores_new %>%
mutate(FoodGroup = complete_data$FoodGroup) %>%
mutate(description = complete_data$ShortDescrip)
pca_scores_new
plot1_new <- ggplot(pca_scores_new,
aes(
x = PC1,
y = PC2,
col = FoodGroup,
label = description
)) +
geom_point() +
ggtitle("PC1 versus PC2 (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot1_new)
plot2_new <- ggplot(pca_scores_new,
aes(
x = PC1,
y = PC3,
col = FoodGroup,
label = description
)) +
geom_point() +
ggtitle("PC1 versus PC3 (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot2_new)
plot3_new <- ggplot(pca_scores_new,
aes(
x = PC2,
y = PC3,
col = FoodGroup,
label = description
)) +
geom_point() +
ggtitle("PC2 versus PC3 (without outlier)") +
theme(plot.title = element_text(hjust = 0.5, size = 16))
ggplotly(plot3_new)